{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import ast\n",
    "import json\n",
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pandas.io.json import json_normalize #package for flattening json in pandas df\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import mmap\n",
    "\n",
    "def get_num_lines(file_path):\n",
    "    \"\"\"Count the lines of a file by scanning a read-only memory map of it.\n",
    "\n",
    "    Args:\n",
    "        file_path: path of the file to scan.\n",
    "\n",
    "    Returns:\n",
    "        Number of lines as an int. A last line without a trailing newline\n",
    "        still counts; an empty file gives 0.\n",
    "    \"\"\"\n",
    "    # Read-only binary mode: counting lines never needs write access.\n",
    "    with open(file_path, \"rb\") as fp:\n",
    "        # mmap cannot map a zero-length file, so detect that case up front.\n",
    "        fp.seek(0, 2)\n",
    "        if fp.tell() == 0:\n",
    "            return 0\n",
    "        # Both context managers release the map and the descriptor on exit.\n",
    "        with mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ) as buf:\n",
    "            lines = 0\n",
    "            while buf.readline():\n",
    "                lines += 1\n",
    "            return lines"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sample the data so that each row looks like this:\n",
    "\n",
    "TEXT | product categories | rating given"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Data directory: set the AMAZON_REVIEWS_ROOT environment variable to override;\n",
    "# when it is not set, the original local path is used.\n",
    "root = os.environ.get(\"AMAZON_REVIEWS_ROOT\", \"/media/felipe/SAMSUNG/AmazonReviews/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The sample file is read in line-delimited mode (one JSON record per line).\n",
    "reviews_path = root + \"/sample_reviews_Books_5.json\"\n",
    "reviews_df = pd.read_json(reviews_path, lines=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>asin</th>\n",
       "      <th>helpful</th>\n",
       "      <th>overall</th>\n",
       "      <th>reviewText</th>\n",
       "      <th>reviewTime</th>\n",
       "      <th>reviewerID</th>\n",
       "      <th>reviewerName</th>\n",
       "      <th>summary</th>\n",
       "      <th>unixReviewTime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0441019420</td>\n",
       "      <td>[0, 0]</td>\n",
       "      <td>4</td>\n",
       "      <td>i'm a big Patricia Briggs fan. i love her stor...</td>\n",
       "      <td>11 7, 2010</td>\n",
       "      <td>A18O0N0QI055FU</td>\n",
       "      <td>Summer D. Olson \"fredomoftruth\"</td>\n",
       "      <td>great book</td>\n",
       "      <td>1289088000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         asin helpful  overall  \\\n",
       "0  0441019420  [0, 0]        4   \n",
       "\n",
       "                                          reviewText  reviewTime  \\\n",
       "0  i'm a big Patricia Briggs fan. i love her stor...  11 7, 2010   \n",
       "\n",
       "       reviewerID                     reviewerName     summary  unixReviewTime  \n",
       "0  A18O0N0QI055FU  Summer D. Olson \"fredomoftruth\"  great book      1289088000  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the first record to see which columns the raw reviews carry.\n",
    "reviews_df.head(n=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the product id, rating and review text, and key the rows by product id.\n",
    "# set_index moves 'asin' out of the columns and into the index in one step.\n",
    "reviews_df = reviews_df[['asin','overall','reviewText']].set_index('asin')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add an all-NaN 'categories' column as a placeholder.\n",
    "reviews_df = reviews_df.assign(categories=np.nan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>overall</th>\n",
       "      <th>reviewText</th>\n",
       "      <th>categories</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>asin</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0441019420</th>\n",
       "      <td>4</td>\n",
       "      <td>i'm a big Patricia Briggs fan. i love her stor...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0316051632</th>\n",
       "      <td>5</td>\n",
       "      <td>The Disappearing Spoon by Sam KeanLittle, Brow...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>038573901X</th>\n",
       "      <td>2</td>\n",
       "      <td>I enjoy a great romance novel now and again, a...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>006144295X</th>\n",
       "      <td>1</td>\n",
       "      <td>I've read all of Kimberla Lawson-Roby novels. ...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0316097543</th>\n",
       "      <td>3</td>\n",
       "      <td>This book almost made it.  The jump from starv...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            overall                                         reviewText  \\\n",
       "asin                                                                     \n",
       "0441019420        4  i'm a big Patricia Briggs fan. i love her stor...   \n",
       "0316051632        5  The Disappearing Spoon by Sam KeanLittle, Brow...   \n",
       "038573901X        2  I enjoy a great romance novel now and again, a...   \n",
       "006144295X        1  I've read all of Kimberla Lawson-Roby novels. ...   \n",
       "0316097543        3  This book almost made it.  The jump from starv...   \n",
       "\n",
       "            categories  \n",
       "asin                    \n",
       "0441019420         NaN  \n",
       "0316051632         NaN  \n",
       "038573901X         NaN  \n",
       "006144295X         NaN  \n",
       "0316097543         NaN  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the first five rows of the trimmed frame.\n",
    "reviews_df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 7736/9430088 [00:39<13:21:05, 196.03it/s]"
     ]
    }
   ],
   "source": [
    "# Fill reviews_df['categories'] from the product metadata in one pass over the file.\n",
    "# Only asins present in reviews_df are remembered and the column is written once at\n",
    "# the end, instead of building a DataFrame and updating reviews_df for every line.\n",
    "metadata_path = root + \"/metadata.json\"\n",
    "wanted_asins = set(reviews_df.index)\n",
    "categories_by_asin = {}\n",
    "\n",
    "with open(metadata_path) as f:\n",
    "    for line in tqdm(f, total=get_num_lines(metadata_path)):\n",
    "        if not line.strip():\n",
    "            continue  # skip blank lines instead of failing to parse them\n",
    "        # NOTE(review): literal_eval is kept as the parser; presumably the records are\n",
    "        # Python-literal dicts rather than strict JSON - confirm against the data.\n",
    "        product = ast.literal_eval(line)\n",
    "        asin = product.get('asin')\n",
    "        if asin not in wanted_asins:\n",
    "            continue\n",
    "        # '' marks a product whose metadata has no 'categories' entry.\n",
    "        categories = product.get('categories', '')\n",
    "        if categories is not None:\n",
    "            categories_by_asin[asin] = categories\n",
    "\n",
    "# Rows whose asin was not found keep their current value (the NaN placeholder).\n",
    "reviews_df['categories'] = [\n",
    "    categories_by_asin.get(asin, current)\n",
    "    for asin, current in zip(reviews_df.index, reviews_df['categories'])\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect a few rows rather than rendering the whole frame.\n",
    "reviews_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The sampled metadata is read in line-delimited mode (one JSON record per line).\n",
    "sample_metadata_path = root + \"/sample_metadata.json\"\n",
    "sample_metadata_df = pd.read_json(sample_metadata_path, lines=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Global TF Kernel (Python 3)",
   "language": "python",
   "name": "global-tf-python-3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
